from os.path import basename, exists

def download(url):
    """Download `url` into the current directory unless it is already there.

    The local file name is the last component of the URL's *path*, so a
    query string or fragment (for example ``?raw=true``) neither ends up in
    the file name nor defeats the "already downloaded" check.

    Parameters
    ----------
    url : str
        Address of the file to fetch.

    Raises
    ------
    ValueError
        If the URL path has no final component to use as a file name.
    """
    # Imported locally, like urlretrieve below, so the helper stays
    # self-contained when copied between notebooks.
    from urllib.parse import urlsplit

    filename = basename(urlsplit(url).path)
    if not filename:
        raise ValueError('Cannot derive a file name from URL: ' + url)

    if exists(filename):
        # Already fetched by an earlier run; skip the network round trip.
        return

    from urllib.request import urlretrieve
    local, _ = urlretrieve(url, filename)
    print('Downloaded ' + local)

# Fetch the BRFSS data file used by the rest of the notebook.
download('https://github.com/AllenDowney/'
         'ElementsOfDataScience/raw/master/brfss.hdf5')


import pandas as pd

# Read the table stored under the 'brfss' key of the HDF5 file.
brfss = pd.read_hdf('brfss.hdf5', key='brfss')
# Last expression of the cell, so the notebook displays (rows, columns).
brfss.shape

(100000, 9)


# Preview the first rows of the table.
brfss.head()


# HTM4 is plotted below as height in cm and WTKG3 as weight in kg.
height, weight = brfss['HTM4'], brfss['WTKG3']


import matplotlib.pyplot as plt
%matplotlib inline

# Baseline scatter plot: the 'o' format string draws circle markers only.
plt.plot(height, weight, 'o')

plt.xlabel('Height in cm')
plt.ylabel('Weight in kg')
# The trailing semicolon keeps the notebook from echoing the return value.
plt.title('Scatter plot of weight versus height');


# Same data with alpha=0.02: each marker is almost transparent, so
# overlapping markers build up into darker regions.
plt.plot(height, weight, 'o', alpha=0.02)

plt.xlabel('Height in cm')
plt.ylabel('Weight in kg')
plt.title('Scatter plot of weight versus height');


# Additionally shrink the markers (markersize=1) to reduce overlap further.
plt.plot(height, weight, 'o', alpha=0.02, markersize=1)

plt.xlabel('Height in cm')
plt.ylabel('Weight in kg')
plt.title('Scatter plot of weight versus height');


import numpy as np

# Add Gaussian jitter (mean 0, standard deviation 2) to the heights,
# drawing one noise value per row of the table.
noise = np.random.normal(loc=0, scale=2, size=len(brfss))
height_jitter = height + noise


# Jittered heights against the original weights.
plt.plot(height_jitter, weight, 'o', markersize=1, alpha=0.02)

plt.title('Scatter plot of weight versus height')
plt.xlabel('Height in cm')
plt.ylabel('Weight in kg');


# A second noise draw, this time applied to the weights.
noise = np.random.normal(loc=0, scale=2, size=len(brfss))
weight_jitter = weight + noise


# Both coordinates jittered.
plt.plot(height_jitter, weight_jitter, 'o', markersize=1, alpha=0.02)

plt.title('Scatter plot of weight versus height')
plt.xlabel('Height in cm')
plt.ylabel('Weight in kg');


# Same plot, restricted to heights 140-200 and weights 0-160.
plt.plot(height_jitter, weight_jitter, 'o', markersize=1, alpha=0.02)

plt.xlim(140, 200)
plt.ylim(0, 160)
plt.title('Scatter plot of weight versus height')
plt.xlabel('Height in cm')
plt.ylabel('Weight in kg');


# Side-by-side comparison: the raw scatter plot (left) next to the
# jittered, transparent, zoomed-in version (right).

# Set the figure size
plt.figure(figsize=(8, 3))

# Create subplots with 1 row, 2 columns, and start plot 1
plt.subplot(1, 2, 1)
plt.plot(height, weight, 'o')

plt.xlabel('Height in cm')
plt.ylabel('Weight in kg')
plt.title('Scatter plot of weight versus height')

# Adjust the layout so the two plots don't overlap
plt.tight_layout()

# Start plot 2
plt.subplot(1, 2, 2)

plt.plot(height_jitter, weight_jitter, 'o',
         alpha=0.02, markersize=1)

plt.xlim([140, 200])
plt.ylim([0, 160])
plt.xlabel('Height in cm')
plt.ylabel('Weight in kg')
plt.title('Scatter plot of weight versus height')
# Run the layout pass again now that the second subplot exists.
plt.tight_layout()


try:
    import empiricaldist
except ImportError:
    !pip install empiricaldist


from empiricaldist import Pmf


# Solution goes here


# Build a Pmf from the weight values and draw it as a bar chart.
Pmf.from_seq(weight).bar()

plt.xlabel('Weight in kg')
plt.ylabel('PMF')
# The trailing semicolon keeps the notebook from echoing the return value.
plt.title('Distribution of weight');


# Solution goes here


# Solution goes here


# Solution goes here


# Solution goes here


# Solution goes here


# Jitter the ages with Gaussian noise (mean 0, standard deviation 1),
# one draw per row of the table.
age = brfss['AGE']
noise = np.random.normal(loc=0, scale=1.0, size=len(brfss))
age_jitter = age + noise

plt.plot(age_jitter, weight_jitter, 'o', markersize=1, alpha=0.01)

plt.ylim(0, 200)
plt.title('Weight versus age')
plt.xlabel('Age in years')
plt.ylabel('Weight in kg');


# Keep only the rows where both AGE and WTKG3 are present.
data = brfss.dropna(subset=['AGE', 'WTKG3'])
# Last expression of the cell, so the notebook displays (rows, columns).
data.shape

(92729, 9)


import seaborn as sns

# One violin per distinct AGE value; inner=None leaves the violins empty
# (no box or sticks drawn inside them).
sns.violinplot(x='AGE', y='WTKG3', data=data, inner=None)

plt.xlabel('Age in years')
plt.ylabel('Weight in kg')
plt.title('Weight versus age');


# Box plot of the same data. whis=10 allows whiskers up to 10 times the
# interquartile range, which presumably covers all points here rather than
# marking them as outliers -- confirm against the rendered figure.
sns.boxplot(x='AGE', y='WTKG3', data=data, whis=10)

plt.xlabel('Age in years')
plt.ylabel('Weight in kg')
plt.title('Weight versus age');


# Same box plot with a logarithmic y axis.
sns.boxplot(x='AGE', y='WTKG3', data=data, whis=10)

plt.yscale('log')
plt.xlabel('Age in years')
plt.ylabel('Weight in kg (log scale)')
plt.title('Weight versus age');


# Solution goes here


# Solution goes here


# Solution goes here


# Restrict the table to the three variables compared in this section.
columns = ['HTM4', 'WTKG3', 'AGE']
subset = brfss[columns]


# Pairwise correlation matrix of the selected columns.
subset.corr()


# Box plot of weight by age again, using rows that have both values.
data = brfss.dropna(subset=['AGE', 'WTKG3'])
sns.boxplot(x='AGE', y='WTKG3', data=data, whis=10)

plt.xlabel('Age in years')
plt.ylabel('Weight in kg')
plt.title('Weight versus age');


# A parabola plus a little Gaussian noise, sampled at 50 evenly spaced
# points on [-1, 1] (50 is np.linspace's default count, spelled out here).
xs = np.linspace(-1, 1, num=50)
ys = xs**2 + np.random.normal(loc=0, scale=0.05, size=len(xs))


plt.plot(xs, ys, 'o', alpha=0.5)
plt.title('Scatter plot of a fake dataset')
plt.xlabel('x')
plt.ylabel('y');


# 2x2 correlation matrix; the off-diagonal entries are the correlation
# between xs and ys.
np.corrcoef(xs, ys)

array([[1.        , 0.01135475],
       [0.01135475, 1.        ]])


# Fake dataset #1: slope 0.02 with small noise (standard deviation 0.15).
# Seeding the generator makes the noise reproducible.
np.random.seed(18)
xs1 = np.linspace(20, 50, num=50)
ys1 = 75 + 0.02 * xs1 + np.random.normal(loc=0, scale=0.15, size=len(xs1))

plt.plot(xs1, ys1, 'o', alpha=0.5)
plt.title('Fake dataset #1')
plt.xlabel('Age in years')
plt.ylabel('Weight in kg');


# Fake dataset #2: slope 0.2 with much larger noise (standard deviation 3),
# generated from the same seed.
np.random.seed(18)
xs2 = np.linspace(20, 50, num=50)
ys2 = 65 + 0.2 * xs2 + np.random.normal(loc=0, scale=3, size=len(xs2))

plt.plot(xs2, ys2, 'o', alpha=0.5)
plt.title('Fake dataset #2')
plt.xlabel('Age in years')
plt.ylabel('Weight in kg');


# Correlation between xs1 and ys1: the off-diagonal entry of the matrix.
rho1 = np.corrcoef(xs1, ys1)[0, 1]
rho1

0.7579660563439407


# Correlation between xs2 and ys2: the off-diagonal entry of the matrix.
rho2 = np.corrcoef(xs2, ys2)[0, 1]
rho2

0.47827769765763184


# Solution goes here


# Solution goes here


# Solution goes here


# Show the two fake datasets next to each other in one wide figure.
plt.figure(figsize=(8, 3))

# Left panel (1 row, 2 columns, position 1): fake dataset #1.
plt.subplot(1, 2, 1)
plt.plot(xs1, ys1, 'o', alpha=0.5)
plt.xlabel('Age in years')
plt.ylabel('Weight in kg')
plt.title('Fake dataset #1')
plt.tight_layout()

# Right panel (position 2): fake dataset #2.
plt.subplot(1, 2, 2)
plt.plot(xs2, ys2, 'o', alpha=0.5)
plt.xlabel('Age in years')
plt.ylabel('Weight in kg')
plt.title('Fake dataset #2')
# Run the layout pass again now that the second subplot exists.
plt.tight_layout()


from scipy.stats import linregress

# Linear regression of ys1 on xs1; the result is displayed as a dict.
res1 = linregress(xs1, ys1)
res1._asdict()

{'slope': 0.018821034903244396,
 'intercept': 75.08049023710964,
 'rvalue': 0.7579660563439407,
 'pvalue': 1.8470158725245546e-10,
 'stderr': 0.002337849260560816,
 'intercept_stderr': 0.08439154079040351}


# Change in the fitted value across 30 units of x (xs1 runs from 20 to 50).
res1.slope * 30

0.5646310470973319


# Linear regression of ys2 on xs2; the result is displayed as a dict.
res2 = linregress(xs2, ys2)
res2._asdict()

{'slope': 0.17642069806488858,
 'intercept': 66.60980474219305,
 'rvalue': 0.47827769765763184,
 'pvalue': 0.0004430600283776228,
 'stderr': 0.046756985211216295,
 'intercept_stderr': 1.6878308158080693}


# Change in the fitted value across 30 units of x (xs2 runs from 20 to 50).
res2.slope * 30

5.292620941946657


# Fake dataset #1 with its fitted line drawn on top.
plt.plot(xs1, ys1, 'o', alpha=0.5)

# Two endpoints are enough to draw a straight line: evaluate the fitted
# line at the smallest and largest x.
fx = np.array([xs1.min(), xs1.max()])
fy = res1.intercept + res1.slope * fx
plt.plot(fx, fy, '-')

plt.xlabel('Age in years')
plt.ylabel('Weight in kg')
plt.title('Fake Dataset #1');


# Fake dataset #2 with its fitted line drawn on top.
plt.plot(xs2, ys2, 'o', alpha=0.5)

# Same two-endpoint construction, using the second regression result.
fx = np.array([xs2.min(), xs2.max()])
fy = res2.intercept + res2.slope * fx
plt.plot(fx, fy, '-')

plt.xlabel('Age in years')
plt.ylabel('Weight in kg')
plt.title('Fake Dataset #2');


# Jittered height/weight scatter plot, shown again before fitting a line.
plt.plot(height_jitter, weight_jitter, 'o', markersize=1, alpha=0.02)

plt.xlim(140, 200)
plt.ylim(0, 160)
plt.title('Scatter plot of weight versus height')
plt.xlabel('Height in cm')
plt.ylabel('Weight in kg');


# Drop rows missing either variable before running the regression.
subset = brfss.dropna(subset=['WTKG3', 'HTM4'])
height_clean, weight_clean = subset['HTM4'], subset['WTKG3']


# Regress weight on height; the result is displayed as a dict.
res_hw = linregress(height_clean, weight_clean)
res_hw._asdict()

{'slope': 0.9192115381848256,
 'intercept': -75.12704250330165,
 'rvalue': 0.47420308979024434,
 'pvalue': 0.0,
 'stderr': 0.005632863769802997,
 'intercept_stderr': 0.960886026543318}


# Evaluate the fitted line at the smallest and largest observed height;
# two endpoints are enough to draw it.
fx = np.array([height_clean.min(), height_clean.max()])
fy = res_hw.intercept + res_hw.slope * fx


# Scatter plot first, then the fitted line on top of it.
plt.plot(height_jitter, weight_jitter, 'o', markersize=1, alpha=0.02)

plt.plot(fx, fy, '-')

plt.xlim(140, 200)
plt.ylim(0, 160)
plt.title('Scatter plot of weight versus height')
plt.xlabel('Height in cm')
plt.ylabel('Weight in kg');


# Jittered age/weight scatter plot, shown again before fitting a line.
plt.plot(age_jitter, weight_jitter, 'o', markersize=1, alpha=0.01)

plt.ylim(0, 160)
plt.title('Weight versus age')
plt.xlabel('Age in years')
plt.ylabel('Weight in kg');


# Drop rows missing either variable, then regress weight on age.
subset = brfss.dropna(subset=['WTKG3', 'AGE'])
age_clean, weight_clean = subset['AGE'], subset['WTKG3']

res_aw = linregress(age_clean, weight_clean)
res_aw._asdict()

{'slope': 0.023981159566968686,
 'intercept': 80.07977583683224,
 'rvalue': 0.021641432889064033,
 'pvalue': 4.3743274930078674e-11,
 'stderr': 0.003638139410742186,
 'intercept_stderr': 0.18688508176870167}


# Scatter plot of jittered age and weight with the fitted line on top.
plt.plot(age_jitter, weight_jitter, 'o', markersize=1, alpha=0.01)

# Evaluate the fitted line at the smallest and largest observed age.
fx = np.array([age_clean.min(), age_clean.max()])
fy = res_aw.intercept + res_aw.slope * fx
plt.plot(fx, fy, '-')

plt.ylim(0, 160)
plt.title('Weight versus age')
plt.xlabel('Age in years')
plt.ylabel('Weight in kg');


# Solution goes here


# Solution goes here


# Solution goes here


# Solution goes here


# Solution goes here

	SEX	HTM4	WTKG3	INCOME2	_LLCPWT	_AGEG5YR	_VEGESU1	_HTMG10	AGE
96230	2.0	160.0	60.33	8.0	1398.525290	6.0	2.14	150.0	47.0
244920	2.0	163.0	58.97	5.0	84.057503	13.0	3.14	160.0	89.5
57312	2.0	163.0	72.57	8.0	390.248599	5.0	2.64	160.0	42.0
32573	2.0	165.0	74.84	1.0	11566.705300	3.0	1.46	160.0	32.0
355929	2.0	170.0	108.86	3.0	844.485450	3.0	1.81	160.0	32.0

Исследуем отношение между переменными

Изучение отношений

Визуализация отношений

Корреляция

Простая регрессия

Рост и вес

	HTM4	WTKG3	AGE
HTM4	1.000000	0.474203	-0.093684
WTKG3	0.474203	1.000000	0.021641
AGE	-0.093684	0.021641	1.000000